# -*- coding: utf-8 -*-
import os
import random
import threading
from threading import Thread
import string
from urllib.parse import quote

import requests
from requests.exceptions import ProxyError, Timeout, ConnectTimeout, ReadTimeout, HTTPError
from urllib3.exceptions import NewConnectionError
from zc_core.client.mongo_client import Mongo
from zc_core.middlewares.proxies.cached_pool import CachedProxyPool
from zc_core.middlewares.proxies.wandou_pool import WandouProxyPool
from zc_core.middlewares.proxies.zhima_pool import ZhimaProxyPool
from zc_core.util import file_reader

# Cap concurrent downloader threads at 8; the permit is acquired before each
# thread starts and released by the downloader thread when it finishes.
threadmax = threading.BoundedSemaphore(8)
# Proxy pool — currently unused: the proxy code in ImageDownloader.run() is commented out.
proxy_pool = ZhimaProxyPool()
# Shared MongoDB client used to list source items.
mongo = Mongo()


class ImageDownloader(Thread):
    """Worker thread that downloads a single image URL to a local file.

    The caller acquires one permit from the module-level ``threadmax``
    semaphore before calling ``start()``; ``run()`` must release that permit
    exactly once, no matter which path it takes.
    """

    def __init__(self, url, file):
        """
        :param url: image URL; non-printable characters are percent-encoded
                    so requests accepts it.
        :param file: destination path for the downloaded image.
        """
        Thread.__init__(self)
        self.url = quote(url, safe=string.printable)
        self.file = file

    def run(self):
        try:
            if os.path.exists(self.file):
                # Another thread may have written the file between the
                # caller's existence check and this thread running.
                print('已下载2：%s' % self.file)
                return
            # proxy = proxy_pool.get_proxy()
            try:
                # http_proxy = 'http://' + proxy
                r = requests.get(
                    url=self.url,
                    timeout=30,
                    # proxies={'http': http_proxy},
                )
                if r and r.status_code == 200:
                    with open(self.file, 'wb') as out:
                        out.write(r.content)
                    print('图片：%s' % self.file)
                else:
                    print('响应异常<url=%s>' % self.url)
            except (
                    ProxyError, NewConnectionError, Timeout, ConnectTimeout, ReadTimeout, ConnectionError,
                    HTTPError) as e:
                # Network/proxy-level failure: best-effort, just log.
                # proxy_pool.remove_proxy(proxy)
                print('移除代理：e=%s' % e)
            except Exception as e:
                print('线程异常：name=%s, url=%s, e=%s' % (self.file, self.url, e))
        finally:
            # BUG FIX: the original released the permit only on the download
            # branch; when the file already existed the semaphore leaked and
            # the pool eventually starved. Release unconditionally — the
            # caller acquired exactly once before start().
            threadmax.release()


class MultiCollector(object):
    """Fans out image downloads for every whitelisted SKU in the source collection."""

    def __init__(self, src_db, base_dir):
        """
        :param src_db: name of the Mongo collection holding the items.
        :param base_dir: root output directory for downloaded images.
        """
        self.thread_pool = list()  # downloader threads, joined in start()
        self.src_db = src_db
        self.base_dir = base_dir
        # Whitelist of SKU ids, one per line (project helper; path is
        # relative to the working directory — TODO confirm it resolves).
        rows = file_reader.read_rows('../doc/filter.txt')
        # Idiom fix: set(rows) — the original built a throwaway list first.
        self.filter = set(rows)

    def start(self):
        """List all items, download their images, then wait for every thread."""
        items = mongo.list(self.src_db, fields={'_id': 1, 'mainImgs': 1, 'detailImgs': 1, 'certs': 1}, query={})
        # Shuffle so concurrent runs don't all hammer the same items in order.
        random.shuffle(items)
        for item in items:
            sku_id = item.get('_id')
            # Skip ids that are not in the whitelist.
            if sku_id not in self.filter:
                continue
            main_imgs = item.get('mainImgs', [])
            dt_imgs = item.get('detailImgs', [])
            certs = item.get('certs', [])

            if main_imgs:
                self.download(sku_id, main_imgs, 'main')
            if dt_imgs:
                self.download(sku_id, dt_imgs, 'detail')
            # if certs:
            #     self.download(sku_id, certs, 'cert')

        for t in self.thread_pool:
            t.join()
        print('任务完成')

    def download(self, sku_id, main_imgs, type_dir):
        """Start one ImageDownloader per image URL that is not yet on disk.

        :param sku_id: item id, used as the per-SKU subdirectory and filename stem.
        :param main_imgs: list of image URLs.
        :param type_dir: category subdirectory ('main' / 'detail' / 'cert').
        """
        out_file_dir = f'{self.base_dir}/{type_dir}/{sku_id}'
        # exist_ok avoids the check-then-create race of the original exists() guard.
        os.makedirs(out_file_dir, exist_ok=True)
        for idx, url in enumerate(main_imgs):
            out_file_name = f'{out_file_dir}/{sku_id}_{idx + 1}.jpg'
            if not os.path.exists(out_file_name):
                # Acquire before start(); the thread releases it when done.
                threadmax.acquire()
                downloader = ImageDownloader(url, out_file_name)
                self.thread_pool.append(downloader)
                downloader.start()
            else:
                print(f'存在[{type_dir}]：{out_file_name}')


# Manual test entry point: download all images for the 2021-07-01 snapshot.
if __name__ == '__main__':
    src_db = 'data_20210701'
    base_dir = 'E:/work/suzhou/20210701'
    # exist_ok avoids the check-then-create race of the original exists() guard.
    os.makedirs(base_dir, exist_ok=True)

    mc = MultiCollector(src_db, base_dir)
    mc.start()
